1 Libraries

Carico le librerie

# Import libraries
# Install any required package that is not already present.
# A vector + setdiff() replaces eight copy-pasted if-statements, and a
# single install.packages() call handles all missing packages at once.
pkg <- installed.packages()
required_pkgs <- c(
  "DBI", "RPostgres", "dplyr", "plotly",
  "caret", "e1071", "glmnet", "randomForest"
)
missing_pkgs <- setdiff(required_pkgs, pkg[, 1])
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)

# Attach the libraries used below. library() stops with an error if a
# package is unavailable; require() only returns FALSE and would let the
# script continue in a broken state.
library(DBI)
library(RPostgres)
library(dplyr)
library(plotly)
library(caret)

2 Import data

Connessione al db

# Connecting to db
# Connecting to db
# NOTE(review): credentials are hard-coded in the script; prefer reading
# them with Sys.getenv() or from a config file kept out of version control.
con <- dbConnect(
  Postgres(),
  user = 'my_user',
  password = '&my_passw',
  dbname = 'my_db',
  host = 'my_host',
  port = 25060,
  sslmode = 'require'
)

Importo i dati in R

# dbGetQuery() = dbSendQuery() + dbFetch() + dbClearResult() in one call,
# so no result set can accidentally be left open.
regressors <- dbGetQuery(con, "SELECT * FROM ccpp.regressors")
target <- dbGetQuery(con, "SELECT * FROM ccpp.target")

# Closing connection
dbDisconnect(con)

3 Esplorazione dati

3.1 Dimensioni e intestazione:

head(regressors)
id at v ap rh
0 8.34 40.77 1010.84 90.01
1 23.64 58.49 1011.40 74.20
2 29.74 56.90 1007.15 41.91
3 19.07 49.69 1007.22 76.79
4 11.80 40.66 1017.13 97.20
5 13.97 39.16 1016.05 84.60
head(target)
id pe
0 480.48
1 NA
2 438.76
3 453.09
4 464.43
5 470.96
dim(target)
## [1] 9565    2
dim(regressors)
## [1] 9565    5

3.2 Join

Merge/join delle tabelle regressors e target attraverso la chiave univoca ID:

# Inner join on the unique key 'id': only rows present in BOTH tables are kept
data <- dplyr::inner_join(regressors, target, by = 'id')
head(data)
id at v ap rh pe
0 8.34 40.77 1010.84 90.01 480.48
1 23.64 58.49 1011.40 74.20 NA
2 29.74 56.90 1007.15 41.91 438.76
3 19.07 49.69 1007.22 76.79 453.09
4 11.80 40.66 1017.13 97.20 464.43
5 13.97 39.16 1016.05 84.60 470.96

3.3 Analisi

Analisi delle variabili covariate e target, distribuzione e valori mancanti (missing value):

summary(data[,-1])
##        at              v               ap                 rh        
##  Min.   : 1.81   Min.   :25.36   Min.   :   992.9   Min.   : 25.56  
##  1st Qu.:13.51   1st Qu.:41.74   1st Qu.:  1009.1   1st Qu.: 63.34  
##  Median :20.34   Median :52.08   Median :  1013.0   Median : 74.98  
##  Mean   :19.65   Mean   :54.30   Mean   :  1023.7   Mean   : 73.31  
##  3rd Qu.:25.72   3rd Qu.:66.54   3rd Qu.:  1017.3   3rd Qu.: 84.84  
##  Max.   :37.11   Max.   :81.56   Max.   :100499.0   Max.   :100.16  
##  NA's   :1                                                          
##        pe          
##  Min.   :   420.3  
##  1st Qu.:   439.8  
##  Median :   451.6  
##  Mean   :   501.1  
##  3rd Qu.:   468.4  
##  Max.   :447150.0  
##  NA's   :1

Le variabili at e pe contengono 1 valore mancante ciascuna. Le variabili ap e pe hanno un valore massimo decisamente superiore alla media e al 3° quartile: analizziamone la distribuzione per determinare la possibile presenza di outlier.

# Side-by-side scatter and histogram for each suspect variable (ap, pe).
# plot()/hist() draw as a side effect -- capturing their return values in
# p1/p2 (as the original did) was pointless, so the assignments are dropped.
# par() returns the previous settings, which we restore at the end.
old_par <- par(mfrow = c(1, 2))

plot(data$ap, main = 'Plot ap')
hist(data$ap, main = 'Histogram ap')

plot(data$pe, main = 'Plot pe')
hist(data$pe, main = 'Histogram pe')

par(old_par)

Dai grafici deduco che il regressore ap e la variabile dipendente pe contengono ciascuno un valore anomalo (outlier): decido di escludere questi valori dal dataset che utilizzerò per il modello.

Rimozione valori anomali e missing:

# Row positions of the missing values in 'at' and in 'pe'
pos_NA <- c(which(is.na(data$at)),which(is.na(data$pe)))
# Row positions of the single extreme value in 'ap' and in 'pe'
# (which.max ignores NA and returns the position of the first maximum)
pos_outlier <- c(which.max(data$ap),which.max(data$pe))
# Drop the flagged rows and the 'id' column (column 1);
# negative indexing with duplicated positions is safe in R
data_clean <- data[-c(pos_NA, pos_outlier),-1]
summary(data_clean)
##        at              v               ap               rh        
##  Min.   : 1.81   Min.   :25.36   Min.   : 992.9   Min.   : 25.56  
##  1st Qu.:13.51   1st Qu.:41.74   1st Qu.:1009.1   1st Qu.: 63.34  
##  Median :20.34   Median :52.08   Median :1013.0   Median : 74.98  
##  Mean   :19.65   Mean   :54.30   Mean   :1013.3   Mean   : 73.32  
##  3rd Qu.:25.72   3rd Qu.:66.54   3rd Qu.:1017.3   3rd Qu.: 84.84  
##  Max.   :37.11   Max.   :81.56   Max.   :1033.3   Max.   :100.16  
##        pe       
##  Min.   :420.3  
##  1st Qu.:439.8  
##  Median :451.6  
##  Mean   :454.4  
##  3rd Qu.:468.4  
##  Max.   :495.8

Matrice di correlazione di Pearson

cor(data_clean)
##            at          v          ap          rh         pe
## at  1.0000000  0.8442329 -0.50740590 -0.54252332 -0.9481211
## v   0.8442329  1.0000000 -0.41378740 -0.31236277 -0.8698600
## ap -0.5074059 -0.4137874  1.00000000  0.09913437  0.5183374
## rh -0.5425233 -0.3123628  0.09913437  1.00000000  0.3896975
## pe -0.9481211 -0.8698600  0.51833742  0.38969749  1.0000000

L’indice di correlazione suggerisce che sussiste un grado di dipendenza, talvolta forte, dei regressori con la variabile dipendente pe. La variabile più correlata con il target è il regressore at, tramite una forte dipendenza negativa.

4 Plot

Scatter plot dei regressori vs variabile dipendente:

# Interactive scatter plots: each regressor against the target pe
plot_ly(data_clean, x=~at, y=~pe, mode = 'markers', type = 'scatter')
plot_ly(data_clean, x=~v, y=~pe, mode = 'markers', type = 'scatter')
plot_ly(data_clean, x=~ap, y=~pe, mode = 'markers', type = 'scatter')
plot_ly(data_clean, x=~rh, y=~pe, mode = 'markers', type = 'scatter')

5 Models

5.1 Variable selection

Dato che le covariate sono 4 e le osservazioni 9558, i gradi di libertà dei modelli saranno elevati. Di conseguenza non risulterà necessario effettuare un processo di variable selection.

5.2 Model selection

5.2.1 Intro

Dal momento che la variabile dipendente pe è quantitativa continua, il problema di fitting è riconducibile alla classe dei modelli supervisionati applicabili a contesti di regressione.

5.2.2 Cross-Validation

Prima di testare i modelli è necessario: * definire la metrica da minimizzare e utilizzare per confrontare il fit dei modelli: RMSE * dotarsi di un metodo di train/test che consenta di evitare il problema dell’overfitting e verificare che il modello si comporti al meglio in contesti predittivi oltre che esplicativi.

Scelgo di applicare una k-fold cross-validation (non ripetuta) con 5 folds: il dataset sarà diviso in 5 gruppi ed il modello sarà stimato (train) su 4 gruppi (80%) e poi applicato (test) su 1 gruppo (20%). Con questo metodo tutte le osservazioni sono utilizzate sia per stimare il modello che per controllare il fit.

# 5-fold cross-validation (not repeated), shared by all caret models below
fitControl <- trainControl(method = "cv", number = 5)

5.2.3 Basic linear model

Come primo step vediamo come si comporta la regressione lineare multipla: questo modello sarà usato come benchmark per valutare le performance di modelli di ML più sofisticati:

# Benchmark model: multiple linear regression under 5-fold CV.
# Same seed before every train() call so the CV folds are identical
# across models and RMSE values are directly comparable.
set.seed(8848)
lm1 <- train(pe~., data = data_clean, trControl=fitControl, method = 'lm')
lm1
## Linear Regression 
## 
## 9558 samples
##    4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 7646, 7646, 7647, 7646, 7647 
## Resampling results:
## 
##   RMSE    Rsquared   MAE     
##   4.5614  0.9287105  3.630108
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

5.2.4 ML Methods

Generalized Additive Model

# Generalized Additive Model using splines -- could be slow
set.seed(8848)
gam1 <- train(pe~., data = data_clean, trControl=fitControl, method = 'gam')
gam1
## Generalized Additive Model using Splines 
## 
## 9558 samples
##    4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 7646, 7646, 7647, 7646, 7647 
## Resampling results across tuning parameters:
## 
##   select  RMSE      Rsquared   MAE     
##   FALSE   4.161785  0.9405765  3.226169
##    TRUE   4.161912  0.9405721  3.226565
## 
## Tuning parameter 'method' was held constant at a value of GCV.Cp
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were select = FALSE and method
##  = GCV.Cp.

Boosted Generalized Additive Model

set.seed(8848)
# Boosted GAM; caret tunes the number of boosting iterations (mstop).
# Could be slow
gamboost1 <- train(pe~., data = data_clean, trControl=fitControl, method = 'gamboost')
gamboost1
## Boosted Generalized Additive Model 
## 
## 9558 samples
##    4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 7646, 7646, 7647, 7646, 7647 
## Resampling results across tuning parameters:
## 
##   mstop  RMSE      Rsquared   MAE     
##    50    4.407041  0.9344974  3.480719
##   100    4.268768  0.9376009  3.358154
##   150    4.239975  0.9383568  3.327934
## 
## Tuning parameter 'prune' was held constant at a value of no
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mstop = 150 and prune = no.

Lasso regression

# Lasso regression (L1 penalty); caret tunes the 'fraction' parameter
set.seed(8848)
lasso1 <- train(pe~., data = data_clean, trControl=fitControl, method = 'lasso')
lasso1
## The lasso 
## 
## 9558 samples
##    4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 7646, 7646, 7647, 7646, 7647 
## Resampling results across tuning parameters:
## 
##   fraction  RMSE       Rsquared   MAE      
##   0.1       15.149830  0.8990594  13.144539
##   0.5        8.067583  0.9079672   6.732064
##   0.9        4.661088  0.9264860   3.717352
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.

Ridge regression

# Ridge regression (L2 penalty); caret tunes the 'lambda' parameter
set.seed(8848)
ridge1 <- train(pe~., data = data_clean, trControl=fitControl, method = 'ridge')
ridge1
## Ridge Regression 
## 
## 9558 samples
##    4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 7646, 7646, 7647, 7646, 7647 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE      Rsquared   MAE     
##   0e+00   4.561400  0.9287105  3.630108
##   1e-04   4.561401  0.9287104  3.630130
##   1e-01   4.873008  0.9200824  3.838092
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.

Polynomial Regression Faccio variare i gradi del polinomio

# Polynomial regression: grid-search the degree (1-3) of each regressor.
# Could be slow: 3^4 = 81 model fits, each under 5-fold CV.
# expand.grid() builds the whole degree grid up front, and the RMSE column
# is preallocated instead of growing the data frame row by row (the original
# started from a 0-row frame and appended by index on every iteration).
degree_grid <- expand.grid(i = 1:3, j = 1:3, k = 1:3, h = 1:3)
model_poly <- data.frame(degree_grid, RMSE = NA_real_)

for (r in seq_len(nrow(model_poly))) {
  f_name <- paste0(
    "pe~poly(at,", model_poly$i[r],
    ")+poly(v,",   model_poly$j[r],
    ")+poly(ap,",  model_poly$k[r],
    ")+poly(rh,",  model_poly$h[r], ")"
  )
  # Same seed each fit -> identical CV folds, comparable RMSE
  set.seed(8848)
  model <- train(formula(f_name), data = data_clean,
                 trControl = fitControl, method = 'lm')
  model_poly$RMSE[r] <- model$results$RMSE
}

# Best degree combination by cross-validated RMSE
model_poly[which.min(model_poly$RMSE),]
i j k h RMSE
81 3 3 3 3 4.245404

Linear Model with Interactions

# Linear model with all interaction terms among the four regressors
# (pe ~ at*v*ap*rh expands to main effects + every 2-, 3-, 4-way interaction)
set.seed(8848)
int1 <- train(pe~at*v*ap*rh, data = data_clean, trControl=fitControl, method = 'lm')
int1
## Linear Regression 
## 
## 9558 samples
##    4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 7646, 7646, 7647, 7646, 7647 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4.272396  0.9374021  3.337471
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Random Forest Con CV il modello non è ottimizzato: utilizzo un approccio train-test 80-20 classico.

# Random forest -- could be slow. Evaluated with a classic 80/20
# train-test split instead of CV.
# BUG FIX: set.seed() must come BEFORE sample(), otherwise the split is
# not reproducible (the original seeded only after drawing the sample).
# Also: use seq_len() and avoid shadowing base::seq with a local 'seq'.
set.seed(8848)
idx <- sample(seq_len(nrow(data_clean)), as.integer(nrow(data_clean) * 0.8))
data_train_rf <- data_clean[idx, ]
data_test_rf <- data_clean[-idx, ]

rf1 <- randomForest::randomForest(pe ~ ., data = data_train_rf,
                                  ntree = 200, mtry = 3, importance = TRUE)
data_test_rf$predict <- predict(rf1, data_test_rf)
data_test_rf$residuals <- data_test_rf$predict - data_test_rf$pe
# Out-of-sample RMSE on the held-out 20%
rf_RMSE <- sqrt(mean((data_test_rf$residuals)^2))

5.2.5 Final Model

Scelgo il modello che minimizza RMSE:

# Collect the best cross-validated RMSE of each candidate model.
# A proper data.frame with a numeric RMSE column replaces the original
# rbind(cbind(...)) construction, which coerced everything to character
# and then needed gsub() + as.numeric() to undo the damage. min() picks
# the best row of each model's tuning grid (caret stores one row per
# tuning-parameter combination in $results).
model_results <- data.frame(
  RMSE = c(
    min(lm1$results$RMSE),
    min(gamboost1$results$RMSE),
    min(gam1$results$RMSE),
    min(lasso1$results$RMSE),
    min(ridge1$results$RMSE),
    min(model_poly$RMSE),
    min(int1$results$RMSE),
    rf_RMSE
  ),
  model = c("lm1", "gamboost1", "gam1", "lasso1",
            "ridge1", "model_poly", "int1", "rf1"),
  type = c("linear model", "gamboost", "gam", "lasso",
           "ridge", "Polynomial lm", "Lm with interactions", "random forest")
)
# Best model = smallest RMSE
pos_best <- which.min(model_results$RMSE)
model_results[pos_best,]
RMSE model type
15 3.61148263174888 rf1 random forest

Summary del modello finale:

# Retrieve the winning model object by its variable name stored as a string
get(as.character(model_results[pos_best,"model"]))
## 
## Call:
##  randomForest(formula = pe ~ ., data = data_train_rf, ntree = 200,      mtry = 3, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 200
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 10.2698
##                     % Var explained: 96.46

Importanza delle variabili:

# Variable importance of the winning model (here a random forest:
# %IncMSE and node-purity increase per regressor)
get(as.character(model_results[pos_best,"model"]))$importance
##       %IncMSE IncNodePurity
## at 374.718235    1633981.74
## v   50.353594     496026.18
## ap  10.087611      42122.85
## rh   9.093884      37690.78

5.3 Conclusioni

Ho scelto il modello che minimizza RMSE. Avendo utilizzato un approccio di model selection robusto, non incorro in problemi di overfitting. L’utilizzo di un modello di ML migliora le performance predittive di circa il 21% rispetto al benchmark (regressione lineare multipla: RMSE da 4.56 a 3.61), che già godeva di un buon fit (R-squared = 0.9287105).